Analyse CO1 data

Diversity within each sample

To assess the diversity within samples, we did a clustering based on distance (<0.10). Each cluster is summarized by a representative sequence (the longest).

library(dendextend)
library(Biostrings)
library(DECIPHER)

get_clusters_and_consensus=function(file, cutoff=0.1){
  seqs=readDNAStringSet(file)
  seqs=RemoveGaps(seqs)
  alignement=AlignSeqs(seqs, processors = NULL)
  d=DistanceMatrix(myXStringSet = alignement, correction = "Jukes-Cantor", processors = NULL, penalizeGapGapMatches=FALSE, penalizeGapLetterMatches=F)
  pdf( file = paste0(file, ".pdf"))
  clusters=IdClusters(d, cutoff = cutoff,showPlot = T, method = "NJ")
  title(main = file)
  dev.off()
  
  n_clusters=max(clusters$cluster)

  # get the longest of each cluster
  representant_liste=list()
  for (i in 1:n_clusters){
    aln=alignement[which(x = clusters$cluster==i)]
    n_sequ=length(aln)
    if (n_sequ==1){
      representant_liste[[i]]=aln[1]
    }
    if (n_sequ>1){
    # get the length of each
    length=apply(alphabetFrequency(aln)[,1:4], 1, sum)
    # choose the longest
    longest=which(length==max(length))[1]
    #consensus_i=ConsensusSequence(aln)
    representant_i=aln[longest]
    representant_liste[[i]]=representant_i
    }
    # change name
    base_name=lapply(strsplit(file, split="_"), FUN = function(x){
    res=paste0(x[1], "_", x[2])})
    names(representant_liste[[i]])=paste0(base_name,"_c", i, "_n=", n_sequ)
    }
  # save the representant per clusters
  representants = unlist(representant_liste)
  representants = do.call(c, representants)
  writeXStringSet(x = reverseComplement(representants), filepath  = paste0(file, "_representants.fa"))
  
  # save the alignment ordered by cluster (reverse complement to be in the correct orientation)
  read=c(1:dim(clusters)[1])
  clusters=data.frame(read, clusters)
  clusters=clusters[order(clusters$cluster, decreasing = F),]
  alignement_ordered=alignement[clusters$read]
  names(alignement_ordered)=paste0(clusters$read, "_c", clusters$cluster)
  writeXStringSet(x = reverseComplement(alignement_ordered), filepath  = paste0(file, "_ordered.aln"))
  
  return(representant_liste)
}


###################################################################################################
# list of files to analyze
ls=list.files(path = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/", pattern = "*.fasta_short.fa$")
setwd(dir = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/")


consensus_per_cluster_liste=list()
for (i in 1:length(ls)){
  print(paste0("sample number:", i," ", ls[i]))
  consensus_per_cluster_liste[[i]]=get_clusters_and_consensus(file  = ls[i], cutoff = 0.1)
}

length(consensus_per_cluster_liste)
consensus_per_cluster_aln = unlist(consensus_per_cluster_liste)
consensus_per_cluster_aln = do.call(c, consensus_per_cluster_aln)
# remove gaps
consensus_per_cluster_aln=RemoveGaps(consensus_per_cluster_aln)
# align
consensus_per_cluster_aln = AlignSeqs(myXStringSet = consensus_per_cluster_aln)

#######################################
# write alignments
#######################################
writeXStringSet(consensus_per_cluster_aln, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/consensus_clusters.aln")

# looking at the alignement, it is clear that some sequences have artifacts (stretches of C):
to_remove=read.table("consensus_bad_sequences_to_remove2.txt")
# filter out these ones
consensus_per_cluster_aln=consensus_per_cluster_aln[-which(names(consensus_per_cluster_aln) %in% to_remove$V1)]

writeXStringSet(consensus_per_cluster_aln, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/consensus_clusters.aln")

BOLD analysis

each representative sequence was then blasted against BOLD database, on the webserver http://www.boldsystems.org/index.php/IDS_IdentificationRequest/view?jobId=varaldi.identificationRequest_varaldi_CBC5D071-B542-4CF3-905C-6937D69AC7E9.H:bold_jobserver-vm:15440606#54_Pachy_c1_n=5

bold=read.table("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/IDEngine_Results_Summary.txt", h=T, sep="\t")
dim(bold)

# remove bad sequences 
bold=bold[-which(bold$Query.ID %in% to_remove$V1),]
dim(bold)
head(bold)

# split first column
names=strsplit(as.character(bold$Query.ID), "_")
names=lapply(X = names, FUN=function(x){
  sample=x[1]
  species=x[2]
  cluster=as.numeric(sub(pattern = "c", replacement = "", x = x[3]))
  nreads=as.numeric(sub(pattern = "n=", replacement = "", x = x[4]))
  res=data.frame(sample, species, cluster, nreads)
  return(res)
})
names=do.call(rbind.data.frame, names)

bold=data.frame(names, bold)
head(bold)

#species identified by BOLD:

levels(bold$Best.ID)

Some reads correspond to unspecific amplification of wolbachia endosymbiont, and a few have no hits. They are removed.

wolbachia_samples=bold[(grep(pattern = "Wolbachia", x = bold$Best.ID)),]$Query.ID
no_match_samples=bold[(grep(pattern = "No match", x = bold$Best.ID)),]$Query.ID

bold=bold[-which(bold$Query.ID %in% union(wolbachia_samples,no_match_samples)),]
bold$Best.ID=factor(bold$Best.ID)
dim(bold)
levels(bold$Best.ID)

Plot raw read composition for each sample

liste=split(bold, f = bold$sample)
tab_per_sample=lapply(X = liste, FUN = function(x){
  res=tapply(X = x$nreads, INDEX = x$Best.ID, FUN = sum)
  res[is.na(res)]=0  
  res=t(as.data.frame(res))
  sample_name=paste("sample", strsplit(as.character(x[1,5]), "_")[[1]][1], strsplit(as.character(x[1,5]), "_")[[1]][2])
  res=data.frame(sample_name, res)
  return(res)
})

table_per_sample=do.call(rbind.data.frame, tab_per_sample)
head(table_per_sample)

pdf(file = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/BOLD_assignment.pdf")
par(mar=c(12,4,2,2))
for (i in 1:dim(table_per_sample)[1]){
  barplot(as.matrix(table_per_sample[i,-1]), las=3, main=table_per_sample$sample_name[i], ylab = "# reads")
}
dev.off()
library("RColorBrewer")
mat=as.matrix(table_per_sample[,-1])
rownames(mat)=table_per_sample$sample_name
heatmap(mat, cexRow=0.6, margins=c(9,7),
        col=brewer.pal(9,"Blues"))

Phylogenetic analysis

Construct a wide CO1 database

The idea is to first perform a general blast using a large db (containing huge diversity, but possibly containing mis-labelled sequences). Then select the list of species found in the first 10 hits grabed from all individual read.

Using BOLD database (BINs)

Download BOLD bins corresponding to Pteromalidae Alysiinae Diapridae Figitidae drosophilinae

cd ~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/
cat Pteromalidae.fasta Alysiinae.fasta Diapridae.fasta Figitidae.fasta drosophilinae.fasta > CO1_all.fasta

remove some sequences with uninformative names

library(Biostrings)
seq=readDNAStringSet(filepath = "~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/CO1_all.fasta")

# sequences in the CO1 database
length(seq)

# get ID and species names
names_seq=names(seq)
names_seq2=strsplit(x = names_seq, split="|", fixed=T)
names_seq_id=unlist(lapply(names_seq2, FUN=function(x){return(x[1])}))
names_seq_species=(lapply(names_seq2, FUN=function(x){return(x[2])}))

# identify the good species identifications
names_seq_species_OK=unlist(lapply(names_seq_species, FUN = function(x){
    cut=strsplit(x = x, split=" ")
    n_words=length(cut[[1]])
    if (n_words==2 & cut[[1]][2]!="sp.") {
      res=1
    } 
    else {
      res=0
    }
    return (res)
}))

names_seq_species=unlist(names_seq_species)
full_name=paste(names_seq_species, names_seq_id, sep = " ")
full_name=sub(pattern = " ", replacement = "_", x = full_name)

tab=data.frame(names_seq_id,names_seq_species, names_seq_species_OK, full_name)
dim(tab)
head(tab)


# subset sequences 
# modify seq names
names2=paste(tab$names_seq_species, tab$names_seq_id, sep = " ")
names2=sub(pattern = " ", replacement = "_", x = names2)
names(seq)=names2

seq2=seq[which(names(seq) %in% tab[tab$names_seq_species_OK==1,]$full_name)]
length(seq2)

writeXStringSet(x = seq2, filepath = "~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/CO1_all_clean.fasta", format = "fasta")

Blast the CO1 sequences against the database

Construct the bd

cd ~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/
makeblastdb -in CO1_all_clean.fasta -dbtype 'nucl' -out CO1_all

Run the blasts

cd /Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET
DB=~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/CO1_all

for file in *_short.fa; do
  echo $file
  blastn -num_threads 20 -query $file -outfmt 6 -max_target_seqs 10 -db $DB -out ../BLASTS/$file.blastn.tab
  cut -f2  ../BLASTS/$file.blastn.tab > ../BLASTS/$file.species.txt
done

# get the whole set of species found
cat *.txt > all_species.txt

Analyse the set of species

set of species

Construct a table with individual read assignement.

setwd("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BLASTS/")

tab=read.table("all_species.txt", h=F)
head(tab)
dim(tab)

# number of "species"
length(unique(tab$V1))

# construct a table with sample info and the identified species for each assembled read:
ls=list.files(path = ".", pattern = "*.fa.blastn.tab")
ls=as.list(ls)

liste_res=lapply(X = ls, FUN = function(x){
  # etract sample informations
  sample_name=sub(x = x, pattern = ".fasta.blastn.tab", replacement = "")
  sample_name_split=lapply(strsplit(sample_name, split="_"), FUN=function(x){return(x[1:5])})[[1]]

    
  blast_result=read.table(file = x)
  blast_result_split=split(x=blast_result$V2, f=blast_result$V1)
  # return one species per read (because blast identify 2 hsp per read, and generates 2 lines per read)
  species=unlist(lapply(X = blast_result_split, FUN=function(x){
    return(x[1])
  }))
  # n reads
  n_reads=length(blast_result_split)
  # create a table with sample info and species identified (based on blast):
  tab1=data.frame(c(1:n_reads), as.data.frame(x = t(sample_name_split)), species)
  names(tab1)=c("read_number", "sample", "species", "location", "year", "month",  "best_blast_species")
  return(tab1)
})


table=do.call(rbind.data.frame, liste_res)
head(table)

construct a curated db

download 10 sequences per identified “species”.

# load the DECIPHER library in R
library(DECIPHER)

setwd("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated")
species_under_score = levels(table$best_blast_species)
species_space= sub(pattern = "_", replacement = " ", x = species_under_score)
n = 10
for (i in 1:length(species_space)){
  
  print (paste0("SAMPLE : ",i," ",species_under_score[i]))
  # download n accession numbers for each species
  expression = paste0("cytochrome oxidase [TI] NOT 5prime[TI] NOT COII[TI] AND 500:800[SLEN] AND ", species_space[i] ," [Organism]")
  command1=paste0("/anaconda2/bin/python ~/python_functions/download_accession_numbers_from_ncbi.py nucleotide \"", expression, "\" ", n, " ", species_under_score[i], ".an.txt")
  system(command1)
  
  # download the corresponding sequences
  expression=paste0("/anaconda2/bin/python ~/python_functions/download_sequ_from_ncbi.py ", species_under_score[i], ".an.txt", " 20 fasta ", species_under_score[i], ".an.fa")
  system(expression)

  # align the sequences for each species 
  file = paste0(species_under_score[i], ".an.fa")
  file_out = paste0(species_under_score[i], ".an.aln")

  # load the sequences from the file
  seqs <- readDNAStringSet(file)

  seqs = RemoveGaps(myXStringSet = seqs)

  if (length(seqs)>1){
  # nucleotide sequences need to be in the same orientation
  # if they are not, then they can be reoriented (optional)
  seqs = OrientNucleotides(seqs)

  # perform the alignment
  aligned <- AlignSeqs(seqs)

  # write alignment
  # write the alignment to a new FASTA file
  writeXStringSet(aligned,
   file=file_out)
  }
}

Merge files and add a few sequences from BOLD (lacking from ncbi):

cd /Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/
cat *.an.fa > all.fa 

Modify names and remove the incorrect sequences

db2=readBStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all.fa")
length(db2)

# remove incorrect sequences
accessions=unlist(lapply(strsplit(names(db2), " "), FUN = function(x){
  return(x[1])
}))

#=db2[-which(x = accessions %in% to_remove)]
length(db2)

# shorten names
names=names(db2)
names[1:10]
names_split=strsplit(names, split=" ")
new_names=lapply(names_split, FUN = function(x){
  res=paste0(x[2], "_" , x[3], "_", x[1])
  return(res)
})

# remove unidentified species
new_names=unlist(new_names)
names(db2)=new_names
db2_clean=db2[-grep(pattern = "sp.", names(db2))]
db2_clean=db2_clean[-grep(pattern = "UNVERIFIED", names(db2_clean))]

length(db2_clean)


writeXStringSet(x = db2_clean, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean.fa")

Orientation check for all sequences and new alignment

# load the DECIPHER library in R
library(DECIPHER)

# specify the path to the FASTA file (in quotes)
fas <- "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean.fa"

# load the sequences from the file
# change "DNA" to "RNA" or "AA" if necessary
seqs <- readDNAStringSet(fas)

# look at some of the sequences (optional)
seqs

seqs = RemoveGaps(myXStringSet = seqs)

# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)

# perform the alignment
aligned <- AlignSeqs(seqs)

# view the alignment in a browser (optional)
#BrowseSeqs(aligned, highlight=0)

# write the alignment to a new FASTA file
writeXStringSet(aligned,
   file="/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean.aln")

A few are not CO1 sequences and should be removed and replaced by correct sequences (subobscura and obscura):

setwd("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/")

to_remove=read.table("to_remove.txt", h=F)
to_remove$V1

aligned2=aligned[-which(names(aligned) %in% to_remove$V1)]
# remove gaps
seqs = RemoveGaps(myXStringSet = aligned2)

to_add=readDNAStringSet("lacking.fas")
# only those called COI-5P are OK
to_add=to_add[grep(pattern = "COI-5P", x = names(to_add))]

seqs_to_Add = RemoveGaps(myXStringSet = to_add)
names=names(seqs_to_Add)
names2=lapply(strsplit(names, "|", fixed=T), FUN=function(x){
  sp=sub(pattern = " ", replacement = "_", x = x[2]) 
  an=x[4]
  res=paste0(sp, "_BOLD", an)
  return(res)
})
names(seqs_to_Add)=names2

# add to the previous
seqs=c(seqs, seqs_to_Add)

# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)

# perform the alignment
aligned2 <- AlignSeqs(seqs)


# write the alignment to a new FASTA file
writeXStringSet(aligned2,
   file="/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean2.aln")

A few species were filtered out (D. phalerata and Pachychrepoideus D supulchrella, and D. kuntzei):

to_add=readDNAStringSet(filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/to_add_again.fa")
to_add2=readDNAStringSet(filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/D.subpulchrella_toadd.fa")

aligned3=c(aligned2, to_add, to_add2)
# remove gaps
seqs = RemoveGaps(myXStringSet = aligned3)
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)

# perform the alignment
aligned3 <- AlignSeqs(seqs)

# view the alignment in a browser (optional)
#BrowseSeqs(aligned3, highlight=0)

# write the alignment to a new FASTA file
writeXStringSet(aligned3,
   file="/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean3.aln")

contruct a phylogeny

aligned3=readDNAStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean3.aln")

# calculate a NJ tree
d <- DistanceMatrix(aligned3,
   correction="Jukes-Cantor")

dend <- IdClusters(d,
   method="NJ",
   type="dendrogram",
   myXStringSet=aligned3, showPlot = T)

dend %>% 
  # Custom branches
  set("branches_col", "grey") %>% set("branches_lwd", 1) %>%
  # Custom labels
  set("labels_col", "black") %>% set("labels_cex", 0.1) %>%
  plot(horiz=T)


dev.print(device = pdf, "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/Phylogeny_db.pdf")

add our samples (obtained after clustering see above) to the phylogenetic tree

aligned3=readDNAStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean3.aln")

consensus_per_cluster_aln2=readDNAStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/consensus_clusters.aln")
# remove the  Wolbachia hits
consensus_per_cluster_aln2=consensus_per_cluster_aln2[-which(names(consensus_per_cluster_aln2) %in%  wolbachia_samples)]

######################
# n sequences in the db
length(aligned3)
# n sequences (consensus per cluster) in our sample
length(consensus_per_cluster_aln2)
full_aln=c(aligned3, reverseComplement(consensus_per_cluster_aln2))
length(full_aln)
#remove gaps
full_aln=RemoveGaps(full_aln)
# change orientation of our samples
#full_aln=OrientNucleotides(full_aln)
# align
full_aln=AlignSeqs(myXStringSet = full_aln)
#BrowseSeqs(full_aln)
# write alignment
writeXStringSet(full_aln, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean+ours.aln")

construct a NJ phylogeny

# calculate a NJ tree
d <- DistanceMatrix(full_aln,
   correction="Jukes-Cantor", processors = NULL, penalizeGapGapMatches=FALSE, penalizeGapLetterMatches=F)

dend <- IdClusters(d,
   method="NJ",
   type="dendrogram",
   myXStringSet=full_aln, showPlot = T)

colours=rep("black", length(full_aln))
colours[grep(labels(dend), pattern = "*_*_c[0-9]_n*")]="red"
dend %>% 
  # Custom branches
  set("branches_col", "grey") %>% set("branches_lwd", 1) %>%
  # Custom labels
  set("labels_col", colours) %>% set("labels_cex", 0.1) %>%
  plot(horiz=T)


dev.print(device = pdf, "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/Phylogeny_db_plus_our_samples_NJ.pdf")

From this first phylogeny, we identified a few strange reads. To check what they are I blasted them against nt: - 44_At_c2_n=1 is a non specific hit = rikketsia sequence (remove) - 54_Pachy_c2_n=1 is a non specific hit (nematode or something like that) - 54_Pachy_c3_n=2 is L. boulardi (contamination?) - 45_Trichopria_c1_n=1 is a non specific hit (virus or something like that)

I remove them from the phylogeny (except 54_Pachy_c3_n=2)

to_remove2=c("44_A.t_c2_n=142", "54_Pachy_c2_n=1", "45_Trichopria_c1_n=1")
full_aln2=full_aln[-which(names(full_aln) %in% to_remove2)]

# remove gaps
seqs = RemoveGaps(myXStringSet = full_aln2)
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)

# perform the alignment
full_aln2 <- AlignSeqs(seqs)
# write it to disk
writeXStringSet(full_aln2, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/all_clean+ours_clean.aln")

# calculate a NJ tree
d <- DistanceMatrix(full_aln2,
   correction="Jukes-Cantor", processors = NULL, 
   penalizeGapGapMatches=FALSE, 
   penalizeGapLetterMatches=F)

dend <- IdClusters(d,
   method="ML",
   type="dendrogram",
   myXStringSet=full_aln2, showPlot = T, processors = NULL)

colours=rep("black", length(full_aln2))
colours[grep(labels(dend), pattern = "*_*_c[0-9]_n*")]="red"
dend %>% 
  # Custom branches
  set("branches_col", "grey") %>% set("branches_lwd", 1) %>%
  # Custom labels
  set("labels_col", colours) %>% set("labels_cex", 0.1) %>%
  plot(horiz=T)


dev.print(device = pdf, "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/Phylogeny_db_plus_our_samples_ML.pdf")

I also ran a phylogenetic reconstruction using seaview on the same alignement (all_clean+ours_clean.aln) to obtain aLRT support for branches.

Now based on the phylogeny, I modified the assignations obtained from BOLD in the IDEngine_Results_Summary_corrected.txt table.

bold_modified=read.table("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/IDEngine_Results_Summary_corrected.txt", h=T , sep="\t")
head(bold_modified)
##            Query.ID                 Best.ID            Search.DB Top..
## 1 1_D.mel_c1_n=3031 Drosophila melanogaster COI SPECIES DATABASE 99.81
## 2     10_L.b_c1_n=1    Leptopilina boulardi COI SPECIES DATABASE 99.21
## 3     10_L.b_c2_n=1    Leptopilina boulardi                 ncbi    NA
## 4   10_L.b_c3_n=944    Leptopilina boulardi COI SPECIES DATABASE 99.81
## 5   10_L.b_c4_n=401    Leptopilina boulardi COI SPECIES DATABASE 99.77
## 6  10_L.b_c5_n=1049    Leptopilina boulardi COI SPECIES DATABASE 99.81
##   Low..
## 1 99.74
## 2 85.43
## 3    NA
## 4 85.92
## 5 85.90
## 6 86.02
# split first column
names=strsplit(as.character(bold_modified$Query.ID), "_")
names=lapply(X = names, FUN=function(x){
  sample=x[1]
  species=x[2]
  cluster=as.numeric(sub(pattern = "c", replacement = "", x = x[3]))
  nreads=as.numeric(sub(pattern = "n=", replacement = "", x = x[4]))
  res=data.frame(sample, species, cluster, nreads)
  return(res)
})
names=do.call(rbind.data.frame, names)

bold_modified=data.frame(names, bold_modified)
head(bold_modified)
##   sample species cluster nreads          Query.ID                 Best.ID
## 1      1   D.mel       1   3031 1_D.mel_c1_n=3031 Drosophila melanogaster
## 2     10     L.b       1      1     10_L.b_c1_n=1    Leptopilina boulardi
## 3     10     L.b       2      1     10_L.b_c2_n=1    Leptopilina boulardi
## 4     10     L.b       3    944   10_L.b_c3_n=944    Leptopilina boulardi
## 5     10     L.b       4    401   10_L.b_c4_n=401    Leptopilina boulardi
## 6     10     L.b       5   1049  10_L.b_c5_n=1049    Leptopilina boulardi
##              Search.DB Top.. Low..
## 1 COI SPECIES DATABASE 99.81 99.74
## 2 COI SPECIES DATABASE 99.21 85.43
## 3                 ncbi    NA    NA
## 4 COI SPECIES DATABASE 99.81 85.92
## 5 COI SPECIES DATABASE 99.77 85.90
## 6 COI SPECIES DATABASE 99.81 86.02
#species identified by BOLD:

levels(bold_modified$Best.ID)
##  [1] "Asobara rufescens"                          
##  [2] "Asobara tabida"                             
##  [3] "Chymomyza amoena"                           
##  [4] "Drosophila hydei"                           
##  [5] "Drosophila immigrans"                       
##  [6] "Drosophila kuntzei"                         
##  [7] "Drosophila melanogaster"                    
##  [8] "Drosophila obscura"                         
##  [9] "Drosophila simulans"                        
## [10] "Drosophila subobscura"                      
## [11] "Drosophila suzukii"                         
## [12] "Leptopilina boulardi"                       
## [13] "Leptopilina heterotoma"                     
## [14] "not specific (nematode?)"                   
## [15] "not specific (Rickettsia)"                  
## [16] "not specific (virus)"                       
## [17] "Pteromalidae sp."                           
## [18] "Trichopria sp. 1 TAS-2012"                  
## [19] "Wolbachia endosymbiont of Blastophaga sp. 1"
## [20] "Wolbachia endosymbiont of Cerapachys undet" 
## [21] "Wolbachia endosymbiont of Microgastrinae"   
## [22] "Wolbachia sp. Cerapachys L_MG10_ASANV577-09"

Remove non specific amplifications (symbionts and others):

wolbachia_samples=bold_modified[(grep(pattern = "Wolbachia", x = bold_modified$Best.ID)),]$Query.ID
not_specific=bold_modified[(grep(pattern = "not specific", x = bold_modified$Best.ID)),]$Query.ID

bold_modified=bold_modified[-which(bold_modified$Query.ID %in% union(wolbachia_samples,not_specific)),]
bold_modified$Best.ID=factor(bold_modified$Best.ID)
dim(bold_modified)
## [1] 101   9
levels(bold_modified$Best.ID)
##  [1] "Asobara rufescens"         "Asobara tabida"           
##  [3] "Chymomyza amoena"          "Drosophila hydei"         
##  [5] "Drosophila immigrans"      "Drosophila kuntzei"       
##  [7] "Drosophila melanogaster"   "Drosophila obscura"       
##  [9] "Drosophila simulans"       "Drosophila subobscura"    
## [11] "Drosophila suzukii"        "Leptopilina boulardi"     
## [13] "Leptopilina heterotoma"    "Pteromalidae sp."         
## [15] "Trichopria sp. 1 TAS-2012"

Plot read composition for each sample :

liste=split(bold_modified, f = bold_modified$sample)
tab_per_sample=lapply(X = liste, FUN = function(x){
  res=tapply(X = x$nreads, INDEX = x$Best.ID, FUN = sum)
  res[is.na(res)]=0  
  res=t(as.data.frame(res))
  sample_name=paste("sample", strsplit(as.character(x[1,5]), "_")[[1]][1], strsplit(as.character(x[1,5]), "_")[[1]][2])
  res=data.frame(sample_name, res)
  return(res)
})

table_per_sample=do.call(rbind.data.frame, tab_per_sample)
head(table_per_sample)
##        sample_name Asobara.rufescens Asobara.tabida Chymomyza.amoena
## 1   sample 1 D.mel                 0              0                0
## 10   sample 10 L.b                 0              0                0
## 11 sample 11 D.mel                 0              0                0
## 12 sample 12 D.sim                 0              0                0
## 13  sample 13 D.im                 0              0                0
## 14 sample 14 D.sub                 0              0                0
##    Drosophila.hydei Drosophila.immigrans Drosophila.kuntzei
## 1                 0                    0                  0
## 10                0                    0                  0
## 11                0                    0                  0
## 12                0                    0                  0
## 13                0                 1594                  0
## 14                0                    0                  0
##    Drosophila.melanogaster Drosophila.obscura Drosophila.simulans
## 1                     3031                  0                   0
## 10                       0                  0                   0
## 11                    1606                  0                   0
## 12                       0                  0                 628
## 13                       0                  0                   0
## 14                       0                141                   0
##    Drosophila.subobscura Drosophila.suzukii Leptopilina.boulardi
## 1                      0                  0                    0
## 10                     0                  0                 2397
## 11                     0                  0                    0
## 12                     0                  0                    0
## 13                     0                  0                    0
## 14                   166                  0                    0
##    Leptopilina.heterotoma Pteromalidae.sp. Trichopria.sp..1.TAS.2012
## 1                       0                0                         0
## 10                      0                0                         0
## 11                      0                0                         0
## 12                      0                0                         0
## 13                      0                0                         0
## 14                      0                0                         0
pdf(file = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/BOLD_assignment_corrected.pdf")
par(mar=c(12,4,2,2))
for (i in 1:dim(table_per_sample)[1]){
  barplot(as.matrix(table_per_sample[i,-1]), las=3, main=table_per_sample$sample_name[i], ylab = "# reads")
}
dev.off()
## quartz_off_screen 
##                 2
for (i in 1:dim(table_per_sample)[1]){
  barplot(as.matrix(table_per_sample[i,-1]), las=3, main=table_per_sample$sample_name[i], ylab = "# reads")
}

library("RColorBrewer")

mat=as.matrix(table_per_sample[,-1])
rownames(mat)=table_per_sample$sample_name

# reorder rows and columns
sort=unlist(lapply(strsplit(rownames(mat), " "), FUN=function(x){
  return(x[3])
}))
d=data.frame(1:length(sort), sort)
names(d)[1]="index"
d=d[order(d$sort, decreasing = T),]

mat=mat[d$index,sort(colnames(mat))]
heatmap(mat, cexRow=0.6, margins=c(9,7),
        col=brewer.pal(9,"Blues"), Rowv=NA, Colv = NA)

# write matrix to the disk
write.table(mat, file = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/Assignment_table.txt", row.names = T, col.names = T, quote=F)

Finally, the phylogeny with branch support (alignement by muscle, PhyML as implemented in seaview) is constructed and imported :

library(ggtree)
## Warning: package 'ggtree' was built under R version 3.5.2
## ggtree v1.14.6  For help: https://guangchuangyu.github.io/software/ggtree
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## - Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
## 
## - Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, accepted. doi: 10.1093/molbev/msy194
## 
## Attaching package: 'ggtree'
## The following object is masked _by_ '.GlobalEnv':
## 
##     multiplot
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
## Loading required package: ggplot2
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
## The following object is masked from 'package:ggtree':
## 
##     ggsave
tree=read.tree("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean+ours_clean-PhyML_tree")

tip=tree$tip.label
colour_tips=rep("blue", length(tree$tip.label))
colour_tips[grep(x = colour_tips, pattern = "*_*_c[0-9]_n*")]="red"
dd=data.frame(tree$tip.label, colour_tips)

aLRT=tree$node.label
aLRT[which(aLRT<0.7)]=""

p2 =ggplot(tree, aes(x, y)) + geom_tree(layout = "rectangular") + 
  theme_tree() + 
  geom_treescale() + 
  geom_tiplab(size = 0.4, aes(colour=rep("red", 871))) + 
  geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.4, hjust=0) 


p2 

ggsave(p2,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML.pdf")
## Saving 7 x 5 in image
### without the small sequence 
# see https://github.com/GuangchuangYu/plotTree-ggtree/blob/master/01_basic_strain_info.R
tree=read.tree("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean+ours_clean-PhyML_tree2")

tip=tree$tip.label
colour_tips=rep("blue", length(tree$tip.label))
colour_tips[grep(x = tip, pattern = "*_*_c[0-9]_n*")]="red"
dd=data.frame(tree$tip.label, colour_tips)
rownames(dd)=tree$tip.label

aLRT=tree$node.label
aLRT[which(aLRT<0.7)]=""

# show node numbers
p <- ggtree(tree, color="grey",linetype="dotted") %<+% dd +
  geom_tree(layout = "rectangular") + 
  theme_tree() + 
  geom_treescale(x = 0, y = 400) +
  geom_tiplab(aes(color=colour_tips), size=0.8) +
  #geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.2, hjust=-0.4) +
  geom_text2(aes(subset=!isTip, label=node), hjust=-0.3, size=0.6)

p

ggsave(p,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML2_node_numbers.pdf", width = 10, height = 15)
   

# highlight the clades that are of interests:
# example Chimomiza node 686
viewClade(p+geom_tiplab(), node=462)

# our clades of interest :
species_highlight=c(686, # chymomiza
                    838, #D. mel
                    859, # D.s im
                    818, # D. suz
                    806, # D. hydei
                    794, # D. kuntzei
                    630, # D. sub
                    625, # D. obscura
                    611, # D. im
                    573, # A.r
                    566, # A.t
                    540, #PAchy
                    485, # Tricho
                    437, #L. he
                    462) #L.b

#species_highlight=c(686, 831, 859, 806, 794, 624, 611, 640, 566, 573, 540, 462, 437, 485, 817)

tree2 = groupClade(tree, .node = species_highlight)
#ggtree(tree, aes(color=group, linetype=group))

p <- ggtree(tree2, aes(color=group)) %<+% dd +
  geom_tree(layout = "rectangular") + 
  theme_tree() + 
  geom_treescale(x = 0, y = 400)+
  geom_tiplab(aes(color=colour_tips), size=0.8) + scale_color_manual(values = c("grey", rep("black", 14), "blue", "red" )) +
  geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.6, hjust=-0.4) 
p

ggsave(p,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML2a.pdf", width = 10, height = 15)



alpha=0.2
extend=0.05

plot_grid(p <- ggtree(tree, color="grey") %<+% dd + 
  geom_tree(layout = "rectangular") + 
  theme_tree() + 
  geom_treescale(x = 0, y = 400, offset = 2) + 
  geom_tiplab(aes(color=colour_tips), size=0.8) + scale_color_manual(values = c("gray45", "firebrick1")) +
  geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.6, hjust=-0.4) +
  geom_hilight(node = species_highlight[1], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[2], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[3], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[4], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[5], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[6], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[7], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[8], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[9], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[10], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[11], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[12], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[13], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[14], "steelblue", extend = extend, alpha = alpha) +
  geom_hilight(node = species_highlight[15], "steelblue", extend = extend, alpha = alpha))

ggsave(p,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML2b.pdf", width = 10, height = 15)